# Read Anurag's processed data, and create YearSeason variable (from Charles)
# NOTE: the global environment is intentionally NOT wiped here.
# `rm(list = ls())` only deletes user objects (attached packages, options and
# the working directory all persist), so it does not give a clean session and
# it silently destroys whatever the caller had in their workspace.
# Run this script in a fresh R session if a clean state is required.
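# NOTE(review): the line below looks like console output left over from an
# earlier run, not code; kept for reference -- confirm and remove if obsolete.
#   Error in names(frame) <- `*vtmp*` : names() applied to a non-vector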
# Load the processed match table from disk.
matches <- read.csv("/Users/pierlim/R_Projects/processed/matches.csv")

# Derive YearSeason: a "01/01/<Year>" string for Spring rows and a
# "01/06/<Year>" string for every other season. `ifelse()` stays outermost so
# a missing Season still yields NA rather than a pasted string.
matches[["YearSeason"]] <- with(
  matches,
  ifelse(
    Season == "Spring",
    paste0("01/01/", Year),
    paste0("01/06/", Year)
  )
)
# Get the team efficiency (matches won / total matches played)
library(data.table)
library(ggplot2)
# Total matches played by each team (consider them as blue or red).
# `table()` tallies how often each tag occurs in the blue / red column.
blueTeams <- data.frame(table(matches$blueTeamTag))
redTeams <- data.frame(table(matches$redTeamTag))
names(blueTeams)[names(blueTeams) == "Var1"] <- "Team"
names(redTeams)[names(redTeams) == "Var1"] <- "Team"

# Full outer join so a team that only ever appears on one side is kept.
combinedPlayed <- merge(blueTeams, redTeams, by = "Team", all = TRUE)
setnames(
  combinedPlayed,
  c("Freq.x", "Freq.y"),
  c("bMatchesPlayed", "rMatchesPlayed")
)

# A team absent from one side gets NA from the outer join; that means
# "zero matches on that side". Without this fill the sum below would be NA.
combinedPlayed$bMatchesPlayed[is.na(combinedPlayed$bMatchesPlayed)] <- 0L
combinedPlayed$rMatchesPlayed[is.na(combinedPlayed$rMatchesPlayed)] <- 0L

combinedPlayed$totalPlayed <-
  combinedPlayed$bMatchesPlayed + combinedPlayed$rMatchesPlayed
# Get each team's wins, whether they won as blue or red.
# Rows where the result flag is NA are dropped by `subset()`.
bluewin <- subset(matches, bResult == 1)
redwin <- subset(matches, rResult == 1)

# Count wins per team tag on each side.
blueTeamWin <- data.frame(table(bluewin$blueTeamTag))
redTeamWin <- data.frame(table(redwin$redTeamTag))
names(blueTeamWin)[names(blueTeamWin) == "Var1"] <- "Team"
names(redTeamWin)[names(redTeamWin) == "Var1"] <- "Team"

# Full outer join so a team that only won on one side is kept.
combinedWins <- merge(blueTeamWin, redTeamWin, by = "Team", all = TRUE)
setnames(
  combinedWins,
  c("Freq.x", "Freq.y"),
  c("bMatchesWon", "rMatchesWon")
)

# NA after the outer join means "no wins on that side"; fill with zero so the
# total below is a real count instead of NA.
combinedWins$bMatchesWon[is.na(combinedWins$bMatchesWon)] <- 0L
combinedWins$rMatchesWon[is.na(combinedWins$rMatchesWon)] <- 0L

combinedWins$totalWon <- combinedWins$bMatchesWon + combinedWins$rMatchesWon
# Merge and calc the win efficiency of each team.
# `all.y = TRUE` keeps every team that played at least one match, including
# teams that never won (they have no row in combinedWins). An inner join would
# drop them here and, later, drop their matches when joining back.
combinedTeams <- merge(combinedWins, combinedPlayed, by = "Team", all.y = TRUE)

# Missing per-side counts mean zero. Fill them first and rebuild the totals
# from the per-side columns so no NA leaks into the ratio.
countCols <- c(
  "bMatchesWon", "rMatchesWon",
  "bMatchesPlayed", "rMatchesPlayed"
)
combinedTeams[countCols][is.na(combinedTeams[countCols])] <- 0L
combinedTeams$totalWon <-
  combinedTeams$bMatchesWon + combinedTeams$rMatchesWon
combinedTeams$totalPlayed <-
  combinedTeams$bMatchesPlayed + combinedTeams$rMatchesPlayed

# Win efficiency = matches won / matches played.
combinedTeams$winEfficiency <-
  combinedTeams$totalWon / combinedTeams$totalPlayed

# Final guard: any remaining NA/NaN (e.g. 0 / 0) is reported as 0.
combinedTeams[is.na(combinedTeams)] <- 0
# Plotting top 20 just for visualization.
# Sort by descending win efficiency and keep at most 20 rows. `head()` is used
# instead of `[1:20, ]` because the latter pads the result with all-NA rows
# when fewer than 20 teams are available.
top20efficient <- head(
  combinedTeams[order(-combinedTeams$winEfficiency), ],
  20
)

# Bar chart of win efficiency per team, with a centred title.
ggplot(top20efficient, aes(x = Team, y = winEfficiency)) +
  geom_bar(stat = "identity") +
  ggtitle("Top 20 Teams Most Efficient At Winning") +
  theme(plot.title = element_text(hjust = 0.5))
# Personally, I don't feel this is an extremely accurate value, as it could be
# that a team played only 10 games and won them all. E.g., SSW played 17 games
# and won 15. Contrast that with SKT, who played many games and naturally did
# not win them all. As such, I don't think it's necessary to group the win
# efficiency by season.
# That said, it is good enough to be used as a predictor, as SKT, the
# undisputed champion, is among the top 10 in win efficiency.
# Put this new derived value back into the processed `matches` data for both
# blueTeamTag and redTeamTag.
# Attach each side's win efficiency to `matches`.
# Columns are selected by name rather than by position, so a change in the
# column layout of `combinedTeams` cannot silently pick the wrong columns.

# Blue side: keep only the team tag and its efficiency, renamed to match the
# join key and to mark the value as blue's.
combinedTeams_blue <- combinedTeams[, c("Team", "winEfficiency")]
setnames(
  combinedTeams_blue,
  c("Team", "winEfficiency"),
  c("blueTeamTag", "bWinEfficiency")
)
matches <- merge(combinedTeams_blue, matches, by = "blueTeamTag")

# Red side: same lookup table, renamed for the red join key.
combinedTeams_red <- combinedTeams[, c("Team", "winEfficiency")]
setnames(
  combinedTeams_red,
  c("Team", "winEfficiency"),
  c("redTeamTag", "rWinEfficiency")
)
matches <- merge(combinedTeams_red, matches, by = "redTeamTag")

# Preview the enriched table.
head(matches)